Using R to Analyze COVID-19 Data

This .Rmd file and the R file found in the same folder contain the same overall R code. However, this file was created to gain experience using and writing .Rmd files. It takes in a csv file of COVID-19 data, such as positive cases, tests, and deaths.

Topics used in this file include:

- Reading csv Files
- Data Frames
- Using the dplyr package
- Filtering and summarizing Data
- Creating plots using plotly and ggplot2
- Creating matrices

The csv file used in this file was retrieved from the following link: https://www.kaggle.com/datasets/lin0li/covid19testing?resource=download

First, the necessary libraries were listed.

# Libraries and packages that will be used
library(dplyr)
library(tibble)

library(ggplot2)
library(plotly)

Next, the csv file was downloaded and stored. It was read in as a data frame.

# Load the COVID-19 testing data from the csv file into a data frame
csv_name <- "tested_worldwide.csv"
covid_data <- read.csv(file = csv_name)

# Preview the first six rows of the data
head(covid_data, n = 6L)
##         Date Country_Region Province_State positive active hospitalized
## 1 2020-01-16        Iceland     All States        3     NA           NA
## 2 2020-01-17        Iceland     All States        4     NA           NA
## 3 2020-01-18        Iceland     All States        7     NA           NA
## 4 2020-01-20    South Korea     All States        1     NA           NA
## 5 2020-01-22  United States     All States        0     NA           NA
## 6 2020-01-22  United States  Massachusetts        0     NA           NA
##   hospitalizedCurr recovered death total_tested daily_tested daily_positive
## 1               NA        NA    NA           NA           NA             NA
## 2               NA        NA    NA           NA           NA              1
## 3               NA        NA    NA           NA           NA              3
## 4               NA        NA    NA            4           NA             NA
## 5               NA        NA     0            0           NA             NA
## 6               NA        NA     0            0           NA             NA

A summary table of the data was also created.

# Per-column summary: type info for character columns, and the
# min / quartiles / mean / max / NA count for numeric columns
summary(object = covid_data)
##      Date           Country_Region     Province_State        positive      
##  Length:27641       Length:27641       Length:27641       Min.   :      0  
##  Class :character   Class :character   Class :character   1st Qu.:    635  
##  Mode  :character   Mode  :character   Mode  :character   Median :   8044  
##                                                           Mean   :  89042  
##                                                           3rd Qu.:  52812  
##                                                           Max.   :9761481  
##                                                           NA's   :4242     
##      active        hospitalized   hospitalizedCurr    recovered     
##  Min.   :   -10   Min.   :    0   Min.   :    0.0   Min.   :     0  
##  1st Qu.:   118   1st Qu.:  553   1st Qu.:   37.0   1st Qu.:   476  
##  Median :  2332   Median : 2592   Median :  280.0   Median :  3159  
##  Mean   : 19030   Mean   : 7495   Mean   :  956.7   Mean   : 25775  
##  3rd Qu.: 15102   3rd Qu.: 8199   3rd Qu.:  808.0   3rd Qu.: 22167  
##  Max.   :558636   Max.   :89995   Max.   :39055.0   Max.   :811330  
##  NA's   :9833     NA's   :19231   NA's   :13080     NA's   :9626    
##      death         total_tested        daily_tested      daily_positive  
##  Min.   :     0   Min.   :        0   Min.   :-1243606   Min.   :-15363  
##  1st Qu.:     9   1st Qu.:    32191   1st Qu.:     473   1st Qu.:     7  
##  Median :   163   Median :   202654   Median :    3107   Median :   135  
##  Mean   :  3074   Mean   :  1485408   Mean   :   19085   Mean   :  1025  
##  3rd Qu.:  1348   3rd Qu.:   844982   3rd Qu.:   11127   3rd Qu.:   659  
##  Max.   :229238   Max.   :136620652   Max.   : 3760260   Max.   :128396  
##  NA's   :4010     NA's   :912         NA's   :1174       NA's   :4557

A number of other functions were then used, both to gain exposure to how they work and to get the key dimensions and features of the data frame.

# Show the name of every column in the data frame
names(covid_data)
##  [1] "Date"             "Country_Region"   "Province_State"   "positive"        
##  [5] "active"           "hospitalized"     "hospitalizedCurr" "recovered"       
##  [9] "death"            "total_tested"     "daily_tested"     "daily_positive"
# Compact, one-line-per-column overview of the data (dplyr::glimpse)
covid_data %>% glimpse()
## Rows: 27,641
## Columns: 12
## $ Date             <chr> "2020-01-16", "2020-01-17", "2020-01-18", "2020-01-20…
## $ Country_Region   <chr> "Iceland", "Iceland", "Iceland", "South Korea", "Unit…
## $ Province_State   <chr> "All States", "All States", "All States", "All States…
## $ positive         <int> 3, 4, 7, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, NA, NA, NA,…
## $ active           <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hospitalized     <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hospitalizedCurr <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ recovered        <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ death            <int> NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, NA, NA…
## $ total_tested     <dbl> NA, NA, NA, 4, 0, 0, 0, 0, 0, 0, 27, 0, 0, 0, NA, NA,…
## $ daily_tested     <int> NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 5, 0, 0, 0, NA, …
## $ daily_positive   <int> NA, 1, 3, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, NA, NA…
# Distinct country/region names, in order of first appearance in the data
unique(covid_data[["Country_Region"]])
##   [1] "Iceland"                          "South Korea"                     
##   [3] "United States"                    "Australia"                       
##   [5] "United Kingdom"                   "Israel"                          
##   [7] "Czechia"                          "Canada"                          
##   [9] "Russia"                           "Armenia"                         
##  [11] "Poland"                           "Italy"                           
##  [13] "Estonia"                          "Greece"                          
##  [15] "Lithuania"                        "Belgium"                         
##  [17] "New Zealand"                      "Sweden"                          
##  [19] "Latvia"                           "Costa Rica"                      
##  [21] "Serbia"                           "Slovakia"                        
##  [23] "Bangladesh"                       "Turkey"                          
##  [25] "Kazakhstan"                       "Palestine"                       
##  [27] "Brazil"                           "Bolivia"                         
##  [29] "Grenada"                          "Spain"                           
##  [31] "Ukraine"                          "Germany"                         
##  [33] "Iran"                             "France"                          
##  [35] "Ireland"                          "Uruguay"                         
##  [37] "Egypt"                            "Singapore"                       
##  [39] "Netherlands"                      "Argentina"                       
##  [41] "Bahrain"                          "Chile"                           
##  [43] "Jamaica"                          "Japan"                           
##  [45] "Malaysia"                         "Malta"                           
##  [47] "Panama"                           "Peru"                            
##  [49] "Trinidad and Tobago"              "Finland"                         
##  [51] "Mexico"                           "Slovenia"                        
##  [53] "Austria"                          "Colombia"                        
##  [55] "Ecuador"                          "North Macedonia"                 
##  [57] "Norway"                           "Portugal"                        
##  [59] "South Africa"                     "Switzerland"                     
##  [61] "United Arab Emirates"             "Azerbaijan"                      
##  [63] "Belarus"                          "Bosnia and Herzegovina"          
##  [65] "China"                            "Croatia"                         
##  [67] "Hungary"                          "Indonesia"                       
##  [69] "Montenegro"                       "Nepal"                           
##  [71] "Pakistan"                         "Thailand"                        
##  [73] "Denmark"                          "India"                           
##  [75] "Kosovo"                           "Kyrgyzstan"                      
##  [77] "Philippines"                      "Romania"                         
##  [79] "Taiwan"                           "Venezuela"                       
##  [81] "Vietnam"                          "Barbados"                        
##  [83] "Scotland"                         "North Korea"                     
##  [85] "Albania"                          "Bulgaria"                        
##  [87] "Emilia-Romagna"                   "Liguria"                         
##  [89] "Lombardy"                         "Marche"                          
##  [91] "Piedmont"                         "Tuscany"                         
##  [93] "Veneto"                           "Nigeria"                         
##  [95] "Luxembourg"                       "Ghana"                           
##  [97] "Tunisia"                          "Cameroon"                        
##  [99] "Ivory Coast"                      "Kenya"                           
## [101] "Morocco"                          "Democratic Republic of the Congo"
## [103] "Uganda"                           "Burkina Faso"                    
## [105] "Cuba"                             "Czech Republic"                  
## [107] "Guinea"                           "Tanzania"                        
## [109] "DR Congo"                         "El Salvador"                     
## [111] "Qatar"                            "Malawi"                          
## [113] "Mozambique"                       "Myanmar"                         
## [115] "Cyprus"                           "Ethiopia"                        
## [117] "Iraq"                             "Paraguay"                        
## [119] "Rwanda"                           "Saudi Arabia"                    
## [121] "Uzbekistan"                       "Lebanon"                         
## [123] "Senegal"                          "Sudan"                           
## [125] "Northern Cyprus"                  "Mauritius"                       
## [127] "Oman"                             "Maldives"                        
## [129] "Bhutan"                           "Sri Lanka"                       
## [131] "Saint Lucia"                      "Afghanistan"                     
## [133] "Algeria"                          "Libya"                           
## [135] "Madagascar"                       "Faroe Islands"                   
## [137] "Greenland"                        "Fiji"                            
## [139] "Papua New Guinea"                 "Kuwait"                          
## [141] "Dominican Republic"               "Gabon"                           
## [143] "Togo"                             "Guatemala"                       
## [145] "Honduras"                         "Jordan"                          
## [147] "Namibia"
# List the distinct countries in alphabetical order
covid_data$Country_Region %>%
  unique() %>%
  sort()
##   [1] "Afghanistan"                      "Albania"                         
##   [3] "Algeria"                          "Argentina"                       
##   [5] "Armenia"                          "Australia"                       
##   [7] "Austria"                          "Azerbaijan"                      
##   [9] "Bahrain"                          "Bangladesh"                      
##  [11] "Barbados"                         "Belarus"                         
##  [13] "Belgium"                          "Bhutan"                          
##  [15] "Bolivia"                          "Bosnia and Herzegovina"          
##  [17] "Brazil"                           "Bulgaria"                        
##  [19] "Burkina Faso"                     "Cameroon"                        
##  [21] "Canada"                           "Chile"                           
##  [23] "China"                            "Colombia"                        
##  [25] "Costa Rica"                       "Croatia"                         
##  [27] "Cuba"                             "Cyprus"                          
##  [29] "Czech Republic"                   "Czechia"                         
##  [31] "Democratic Republic of the Congo" "Denmark"                         
##  [33] "Dominican Republic"               "DR Congo"                        
##  [35] "Ecuador"                          "Egypt"                           
##  [37] "El Salvador"                      "Emilia-Romagna"                  
##  [39] "Estonia"                          "Ethiopia"                        
##  [41] "Faroe Islands"                    "Fiji"                            
##  [43] "Finland"                          "France"                          
##  [45] "Gabon"                            "Germany"                         
##  [47] "Ghana"                            "Greece"                          
##  [49] "Greenland"                        "Grenada"                         
##  [51] "Guatemala"                        "Guinea"                          
##  [53] "Honduras"                         "Hungary"                         
##  [55] "Iceland"                          "India"                           
##  [57] "Indonesia"                        "Iran"                            
##  [59] "Iraq"                             "Ireland"                         
##  [61] "Israel"                           "Italy"                           
##  [63] "Ivory Coast"                      "Jamaica"                         
##  [65] "Japan"                            "Jordan"                          
##  [67] "Kazakhstan"                       "Kenya"                           
##  [69] "Kosovo"                           "Kuwait"                          
##  [71] "Kyrgyzstan"                       "Latvia"                          
##  [73] "Lebanon"                          "Libya"                           
##  [75] "Liguria"                          "Lithuania"                       
##  [77] "Lombardy"                         "Luxembourg"                      
##  [79] "Madagascar"                       "Malawi"                          
##  [81] "Malaysia"                         "Maldives"                        
##  [83] "Malta"                            "Marche"                          
##  [85] "Mauritius"                        "Mexico"                          
##  [87] "Montenegro"                       "Morocco"                         
##  [89] "Mozambique"                       "Myanmar"                         
##  [91] "Namibia"                          "Nepal"                           
##  [93] "Netherlands"                      "New Zealand"                     
##  [95] "Nigeria"                          "North Korea"                     
##  [97] "North Macedonia"                  "Northern Cyprus"                 
##  [99] "Norway"                           "Oman"                            
## [101] "Pakistan"                         "Palestine"                       
## [103] "Panama"                           "Papua New Guinea"                
## [105] "Paraguay"                         "Peru"                            
## [107] "Philippines"                      "Piedmont"                        
## [109] "Poland"                           "Portugal"                        
## [111] "Qatar"                            "Romania"                         
## [113] "Russia"                           "Rwanda"                          
## [115] "Saint Lucia"                      "Saudi Arabia"                    
## [117] "Scotland"                         "Senegal"                         
## [119] "Serbia"                           "Singapore"                       
## [121] "Slovakia"                         "Slovenia"                        
## [123] "South Africa"                     "South Korea"                     
## [125] "Spain"                            "Sri Lanka"                       
## [127] "Sudan"                            "Sweden"                          
## [129] "Switzerland"                      "Taiwan"                          
## [131] "Tanzania"                         "Thailand"                        
## [133] "Togo"                             "Trinidad and Tobago"             
## [135] "Tunisia"                          "Turkey"                          
## [137] "Tuscany"                          "Uganda"                          
## [139] "Ukraine"                          "United Arab Emirates"            
## [141] "United Kingdom"                   "United States"                   
## [143] "Uruguay"                          "Uzbekistan"                      
## [145] "Veneto"                           "Venezuela"                       
## [147] "Vietnam"
# Size of the data frame: c(rows, columns), then each count on its own
dim(covid_data)
## [1] 27641    12
dim(covid_data)[1L]
## [1] 27641
dim(covid_data)[2L]
## [1] 12
# Structure of the data frame: type and first values of every column
str(object = covid_data)
## 'data.frame':    27641 obs. of  12 variables:
##  $ Date            : chr  "2020-01-16" "2020-01-17" "2020-01-18" "2020-01-20" ...
##  $ Country_Region  : chr  "Iceland" "Iceland" "Iceland" "South Korea" ...
##  $ Province_State  : chr  "All States" "All States" "All States" "All States" ...
##  $ positive        : int  3 4 7 1 0 0 0 0 0 0 ...
##  $ active          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ hospitalized    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ hospitalizedCurr: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ recovered       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ death           : int  NA NA NA NA 0 0 0 0 0 0 ...
##  $ total_tested    : num  NA NA NA 4 0 0 0 0 0 0 ...
##  $ daily_tested    : int  NA NA NA NA NA NA NA 0 0 0 ...
##  $ daily_positive  : int  NA 1 3 NA NA NA NA 0 0 0 ...

The data was then filtered to only get rows involving the entire country. Additionally, the names of the columns were changed, and the NA values were replaced with 0.

# Keep only the country-level rows (Province_State == "All States"), then
# drop the province column, which is constant after the filter
country_covid_data <- covid_data %>%
  filter(Province_State == "All States") %>%
  select(-Province_State)
head(country_covid_data)
##         Date Country_Region positive active hospitalized hospitalizedCurr
## 1 2020-01-16        Iceland        3     NA           NA               NA
## 2 2020-01-17        Iceland        4     NA           NA               NA
## 3 2020-01-18        Iceland        7     NA           NA               NA
## 4 2020-01-20    South Korea        1     NA           NA               NA
## 5 2020-01-22  United States        0     NA           NA               NA
## 6 2020-01-23  United States        0     NA           NA               NA
##   recovered death total_tested daily_tested daily_positive
## 1        NA    NA           NA           NA             NA
## 2        NA    NA           NA           NA              1
## 3        NA    NA           NA           NA              3
## 4        NA    NA            4           NA             NA
## 5        NA     0            0           NA             NA
## 6        NA     0            0            0              0
# Give the columns readable, consistently capitalized names
# NOTE(review): "Positve" looks like a typo for "Positive"; it is kept as-is
# because the name is part of the data frame that later code and output see.
names(country_covid_data) <- c(
  "Date", "Country", "Positve", "Active", "Hospitalized",
  "Hospitalized_Currently", "Recovered", "Death", "Total_Tested",
  "Daily_Tested", "Daily_Positive"
)

# Overwrite every NA in the data frame with 0, so "not reported" and
# "zero" become indistinguishable from here on
country_covid_data[is.na(country_covid_data)] <- 0
head(country_covid_data)
##         Date       Country Positve Active Hospitalized Hospitalized_Currently
## 1 2020-01-16       Iceland       3      0            0                      0
## 2 2020-01-17       Iceland       4      0            0                      0
## 3 2020-01-18       Iceland       7      0            0                      0
## 4 2020-01-20   South Korea       1      0            0                      0
## 5 2020-01-22 United States       0      0            0                      0
## 6 2020-01-23 United States       0      0            0                      0
##   Recovered Death Total_Tested Daily_Tested Daily_Positive
## 1         0     0            0            0              0
## 2         0     0            0            0              1
## 3         0     0            0            0              3
## 4         0     0            4            0              0
## 5         0     0            0            0              0
## 6         0     0            0            0              0

The data for only Canada was retrieved from the original data frame to be used to create different types of graphs.

# Build the Canada-only data set: keep Canada's rows, drop the now-constant
# Country column, and parse Date from character into a Date object
canada_data <- country_covid_data %>%
  filter(Country == "Canada") %>%
  select(-Country) %>%
  mutate(Date = as.Date(Date))

glimpse(canada_data)
## Rows: 258
## Columns: 10
## $ Date                   <date> 2020-01-31, 2020-02-08, 2020-02-16, 2020-02-21…
## $ Positve                <dbl> 4, 7, 8, 9, 10, 11, 12, 13, 15, 24, 33, 45, 51,…
## $ Active                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Hospitalized           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Hospitalized_Currently <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Recovered              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Death                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ Total_Tested           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Daily_Tested           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Daily_Positive         <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4, 5, 6, 6, 0, 15…

First, the plot() function was used to create a line graph of daily positive cases by date.

# Base-graphics line chart of Canada's daily positive cases over time
with(
  canada_data,
  plot(
    x = Date, y = Daily_Positive,
    type = "l", lwd = 2,
    main = "Graph of Canada Daily Positive COVID Cases",
    xlab = "Date", ylab = "Number of Cases"
  )
)

Next, using the filtered Canada Data, a ggplot2 line graph was created to display active cases by day.

# Use ggplot2 to create a line graph of active cases by day.
# geom_line() connects the observations in order of Date; the previous
# geom_point() drew a scatter plot rather than the intended line graph.
# Days whose Active value was NA were replaced with 0 earlier, so they
# appear on the line as zeros.
ggplot(canada_data, aes(x = Date, y = Active)) +
  geom_line(color = "blue") +
  labs(title = "Canada: Active COVID-19 Cases")

Lastly, a plotly bar graph was created to show deaths by day.

# Create bar graph of deaths by day using plotly
canada_bar_graph <- plot_ly(canada_data, x = ~Date, y = ~Death, type = "bar",
                            marker = list(color = "red"))

# Add the title and set the gap between adjacent bars. The layout attribute
# is spelled `bargap`; the earlier misspelling `bargrap` was not a valid
# attribute, so plotly ignored it and emitted a warning.
canada_bar_graph <- canada_bar_graph %>% layout(title = "Canada COVID Deaths",
                                                bargap = 0.2)

canada_bar_graph

The data was further summarized by total cases, tests, and hospitalizations by country.

# Collapse the daily rows into one row per country with the summed daily
# tests, summed daily positives, and summed Hospitalized_Currently values,
# then order the countries from most to fewest positive cases.
# NOTE(review): Hospitalized adds up Hospitalized_Currently across all days;
# if that column is a per-day snapshot, the sum is not a count of
# distinct hospitalizations -- confirm the intended meaning.
country_daily_summary <- country_covid_data %>%
  group_by(Country) %>%
  summarise(
    Tested = sum(Daily_Tested),
    Positive = sum(Daily_Positive),
    Hospitalized = sum(Hospitalized_Currently)
  ) %>%
  arrange(desc(Positive))
country_daily_summary
## # A tibble: 146 × 4
##    Country           Tested Positive Hospitalized
##    <chr>              <dbl>    <dbl>        <dbl>
##  1 United States  136937092  9850413            0
##  2 Italy           17370389   934875      2401146
##  3 Russia          11319603   432269            0
##  4 Bangladesh       2442470   420235            0
##  5 Czechia          2557224   411220            0
##  6 Canada           9873530   259992            0
##  7 Turkey           4351655   221499            0
##  8 United Kingdom   1460486   163418            0
##  9 Costa Rica        320327   116361        56929
## 10 Armenia           438837   106424      1768081
## # ℹ 136 more rows

Afterwards, to complete the analysis, the top three countries by total positive cases, tests, hospitalizations and positive test rate were determined and stored in a matrix.

# Top three countries by positive cases. The summary table is already
# sorted by descending Positive, so the first three rows are the leaders.
most_positive <- head(country_daily_summary, 3)
top_positive_countries <- most_positive[["Country"]]
top_positive_countries
## [1] "United States" "Italy"         "Russia"
# Top three countries by total tests: re-sort by descending Tested and
# keep the first three rows
most_tested <- country_daily_summary %>%
  arrange(desc(Tested)) %>%
  head(3)
top_tested_countries <- most_tested[["Country"]]
top_tested_countries
## [1] "United States" "India"         "Italy"
# Top three countries by the summed hospitalization figure: re-sort by
# descending Hospitalized and keep the first three rows
most_hospitalized <- country_daily_summary %>%
  arrange(desc(Hospitalized)) %>%
  head(3)
top_hospitalized_countries <- most_hospitalized[["Country"]]
top_hospitalized_countries
## [1] "Italy"     "Armenia"   "Singapore"
# Add the positive test rate: total positives divided by total tests.
# A country whose Tested total is 0 gets Inf (or NaN when Positive is also 0),
# because the division is not guarded.
country_daily_summary <- country_daily_summary %>%
  mutate(Rate = Positive / Tested)
head(country_daily_summary)
## # A tibble: 6 × 5
##   Country          Tested Positive Hospitalized   Rate
##   <chr>             <dbl>    <dbl>        <dbl>  <dbl>
## 1 United States 136937092  9850413            0 0.0719
## 2 Italy          17370389   934875      2401146 0.0538
## 3 Russia         11319603   432269            0 0.0382
## 4 Bangladesh      2442470   420235            0 0.172 
## 5 Czechia         2557224   411220            0 0.161 
## 6 Canada          9873530   259992            0 0.0263
# Top three countries by positive test rate: re-sort by descending Rate
# and keep the first three rows
high_rate <- country_daily_summary %>%
  arrange(desc(Rate)) %>%
  head(3)
highest_rate_countries <- high_rate[["Country"]]
highest_rate_countries
## [1] "Iceland"    "Costa Rica" "Scotland"
# Stack the four top-three vectors into a 4 x 3 character matrix:
# one row per category (named after its vector), one column per rank
covid_leaders <- rbind(
  top_positive_countries = top_positive_countries,
  top_tested_countries = top_tested_countries,
  top_hospitalized_countries = top_hospitalized_countries,
  highest_rate_countries = highest_rate_countries
)
covid_leaders
##                            [,1]            [,2]         [,3]       
## top_positive_countries     "United States" "Italy"      "Russia"   
## top_tested_countries       "United States" "India"      "Italy"    
## top_hospitalized_countries "Italy"         "Armenia"    "Singapore"
## highest_rate_countries     "Iceland"       "Costa Rica" "Scotland"